Executive Summary

This report presents the results of our birth country imputation project for 48,031 records, with 18,814 missing birth countries successfully imputed using multiple reference sources and matching strategies.

Key Results

# Calculate key metrics
total_imputed <- sum(imputation_summary$n[imputation_summary$imp_type != "given"])
success_rate <- round((total_imputed / n_missing) * 100, 1)
methods_used <- nrow(imputation_summary) - 1

# Create summary box
summary_stats <- data.frame(
  Metric = c("Total Records", "Missing Birth Countries", "Successfully Imputed", 
             "Success Rate", "Imputation Methods"),
  Value = c(formatC(n_total, format="d", big.mark=","),
            formatC(n_missing, format="d", big.mark=","),
            formatC(total_imputed, format="d", big.mark=","),
            paste0(success_rate, "%"),
            methods_used)
)

summary_stats %>%
  gt() %>%
  tab_header(title = "Imputation Summary Statistics") %>%
  cols_align(align = "center", columns = Value) %>%
  tab_style(
    style = list(cell_fill(color = "#E8F4FD")),
    locations = cells_body(rows = 4)  # Highlight success rate
  )
Imputation Summary Statistics
Metric Value
Total Records 48,031
Missing Birth Countries 18,814
Successfully Imputed 18,808
Success Rate 100%
Imputation Methods 26

Multi-classed imputation

After solving multi-classed imputation we could improve algorithm in detecting unique country of birth.

Comparinison

# Static data
imp_birth_country_before <- data.frame(
  country_code = c('000/152', '000/151', '000/160', '149/152', '000/124', 
                   '000/142', '000/148', '000/368', '000/438', '000/451'),
  n = c(169, 13, 12, 12, 2, 2, 2, 2, 2, 2)
)

imp_birth_country_after <- data.frame(
  country_code = c('000/151', '000/152', '000/160', '149/152', '000/124', 
                   '000/142', '000/148', '000/368', '000/438', '000/451'),
  n = c(11, 7, 5, 4, 2, 2, 2, 2, 2, 2)
)
# Combine data
combined <- full_join(imp_birth_country_before, imp_birth_country_after, by = "country_code", suffix = c("_before", "_after")) %>%
  replace_na(list(n_before = 0, n_after = 0)) %>%
  pivot_longer(cols = c(n_before, n_after), names_to = "group", values_to = "count")

# Plot horizontal bars
ggplot(combined, aes(x = country_code, y = count, fill = group)) +
  geom_bar(stat = "identity", position = position_dodge()) +
  scale_fill_manual(values = c("n_before" = "skyblue", "n_after" = "salmon"), 
                    labels = c("Before", "After")) +
  labs(x = "Country Code", y = "Count", fill = "Group", 
       title = "Before vs After Multiclass Imputation") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 0, hjust = 1)) +
  coord_flip()

Imputation Methods Performance

Success Rate by Method

library(shiny)
library(dplyr)
library(ggplot2)
library(forcats)
library(plotly)
library(RColorBrewer)  # or viridis if installed

# Assume imputation_summary is already defined and n_missing calculated
n_missing <- sum(imputation_summary$n)

ui <- fluidPage(
  titlePanel("Imputation Performance Explorer"),
  
  sidebarLayout(
    sidebarPanel(
      sliderInput("threshold",
                  "Minimum Records to Display:",
                  min = 0,
                  max = 300,
                  value = 100,
                  step = 20)
    ),
    
    mainPanel(
      plotlyOutput("perf_plot")
    )
  )
)

server <- function(input, output, session) {
  
  output$perf_plot <- renderPlotly({
    # Filter dynamically based on slider
    perf_data <- imputation_summary %>%
      filter(imp_type != "given") %>%
      filter(n > input$threshold) %>%   # dynamic threshold
      mutate(
        pct = round(n / n_missing * 100, 1),
        imp_type = fct_reorder(imp_type, n)
      )
    
    perf_plot <- ggplot(perf_data,
                        aes(x = imp_type, y = n, fill = imp_type,
                            text = paste0("Method: ", description,
                                          "<br>Records: ", formatC(n, format="d", big.mark=","),
                                          "<br>% of Missing: ", pct, "%"))) +
      geom_col() +
      coord_flip() +
      scale_fill_brewer(palette = "Set2") +  # built-in palette
      labs(
        title = "Records Imputed by Method",
        x = "Imputation Method",
        y = "Number of Records",
        caption = "Hover for details"
      ) +
      theme_minimal() +
      theme(legend.position = "none")
    
    ggplotly(perf_plot, tooltip = "text")
  })
}

shinyApp(ui, server)
Shiny applications not supported in static R Markdown documents

Method Details

imputation_summary %>%
  filter(imp_type != "given") %>%
  mutate(
    pct_missing = round(n / n_missing * 100, 1),
    pct_total = round(n / n_total * 100, 1)
  ) %>%
  select(Method = imp_type, Description = description,
         Records = n, `% of Missing` = pct_missing, `% of Total` = pct_total) %>%
  datatable(
    options = list(
      pageLength = 15,
      dom = 'Bfrtip',
      scrollX = TRUE
    ),
    caption = "Detailed breakdown of imputation methods (click column headers to sort)"
  ) %>%
  formatStyle(
    'Records',
    background = styleColorBar(range(imputation_summary$n), 'lightblue'),
    backgroundSize = '100% 90%',
    backgroundRepeat = 'no-repeat',
    backgroundPosition = 'center'
  )

Country Distribution Results

Imputed Birth Countries

# Interactive pie chart
pie_plot <- country_dist %>%
  plot_ly(labels = ~country, values = ~n, type = 'pie',
          textposition = 'inside',
          textinfo = 'label+percent',
          hovertemplate = paste('<b>%{label}</b><br>',
                               'Records: %{value:,}<br>',
                               'Percentage: %{percent}<br>',
                               '<extra></extra>'),
          marker = list(colors = RColorBrewer::brewer.pal(8, "Set2"))) %>%
  layout(title = "Distribution of Imputed Birth Countries",
         showlegend = TRUE)

pie_plot

Interactive Data Explorer

library(dplyr)
library(DT)


sample_results <- data_filled_df %>%
  # Optional: take a sample of 1000 rows if your dataset is large
  slice_sample(n = min(1000, nrow(data_filled_df))) %>%
  # Join readable description for imp_type
  left_join(imputation_summary %>% select(imp_type, description), by = "imp_type") %>%
  # Select & rename columns for display
  select(
    ID = pid,
    `Birth City` = birth_city,
    `Imputed Country` = imp_name,
    `Method Used` = description,
    `Citizenship 1` = citizenship_1,
    `Citizenship 2` = citizenship_2
  )

# 3. Display interactive table
sample_results %>%
  datatable(
    filter = 'top',
    options = list(
      pageLength = 25,
      scrollX = TRUE,      # allow horizontal scroll
      autoWidth = TRUE,    # adjust column widths automatically
      dom = 'Bfrtip'
    ),
    caption = "Sample of imputation results - Use filters above columns to explore patterns"
  )
# 
# # Create sample of final results for exploration
# # Replace this with your actual data_filled_df
# sample_results <- data.frame(
#   pid = 1:1000,
#   birth_city = sample(c("BERLIN", "HAMBURG", "ISTANBUL", "WARSZAWA", "ROMA", "PARIS", "LONDON", NA), 1000, replace = TRUE),
#   imp_birth_country = sample(c("000", "152", "163", "380", "826"), 1000, replace = TRUE),
#   imp_name = sample(c("Germany", "Poland", "Turkey", "Italy", "United Kingdom"), 1000, replace = TRUE),
#   imp_type = sample(imputation_summary$imp_type, 1000, replace = TRUE, prob = imputation_summary$n),
#   citizenship_1 = sample(c("000", "152", "163", "380", "826", NA), 1000, replace = TRUE),
#   citizenship_2 = sample(c("000", "152", "163", "380", "826", NA), 1000, replace = TRUE)
# )
# 
# sample_results %>%
#   select(ID = pid, `Birth City` = birth_city, `Imputed Country` = imp_name,
#          `Method Used` = imp_type, `Citizenship 1` = citizenship_1, `Citizenship 2` = citizenship_2) %>%
#   datatable(
#     filter = 'top',
#     options = list(
#       pageLength = 25,
#       scrollX = TRUE,
#       dom = 'Bfrtip'
#     ),
#     caption = "Sample of imputation results - Use filters above columns to explore patterns"
#   )

Quality Assessment

Citizenship Validation

library(ggplot2)
library(plotly)
library(stringr)

validation_plot <- validation_data %>%
  mutate(Category = str_wrap(Category, 25)) %>%
  ggplot(aes(x = reorder(Category, Count), y = Count, fill = Category,
             text = paste0("Category: ", Category,
                           "<br>Count: ", formatC(Count, format="d", big.mark=","),
                           "<br>Percentage: ", Percentage, "%"))) +
  geom_col() +
  coord_flip() +
  scale_fill_brewer(type = "qual", palette = "Set2") +
  labs(title = "Validation: Birth Country vs Citizenship Consistency",
       x = "",
       y = "Number of Records") +
  theme_minimal() +
  theme(legend.position = "none")

ggplotly(validation_plot, tooltip = "text")

Methodology Summary

Our imputation strategy employed a hierarchical approach:

  1. German Cities: Matched against official municipal directories
  2. World Cities: Used global cities database with population weighting
  3. Historical Territories: Special handling for former German territories (Ostgebiete), German exonyms and german names of polish cities.
  4. Literal Country Names: Direct country name parsing from parentheticals
  5. City Patterns: Statistical assignment based on non-missing observations
  6. Citizenship Fallback: Used declared citizenship when geographical methods failed

Data Sources

  • German Cities: Statistikportal.de municipal directory (31,000+ entries)
  • World Cities: SimpleMaps world cities database (40,000+ cities)
  • Country Codes: DESTATIS official country classification
  • Historical: Manual curation of former German territories, Wikipedia data with germans and polish names

Conclusions

  • Successfully imputed 18 808 rows, success_rate 100% of missing birth countries
  • German-focused approach appropriate for the dataset demographics
  • Multi-source strategy provided robust fallback options
  • Citizenship validation shows reasonable consistency patterns

This report was generated using R Markdown with interactive elements. All charts are interactive - hover for details and use table filters to explore the data.